Scrape

https://rpubs.com/uky994/578161

https://rstudio-pubs-static.s3.amazonaws.com/299685_5ce4f9fb6fa3476e98fad355623a5f1e.html

Ratings 6.0 to 10.0

# Scrape the IMDb advanced-search listing, 50 titles per results page, into
# the data frame `IMDB` (one row per title, all columns still raw text).
#
# First results page, for reference:
# https://www.imdb.com/search/title/?title_type=feature&user_rating=6.0,10.0&num_votes=20000,&certificates=US%3AG,US%3APG,US%3APG-13,US%3AR,US%3ANC-17&countries=us&languages=en&sort=user_rating,desc&start=1&ref_=adv_nxt

# Defined up front so `IMDB` exists even if a request fails part-way through;
# pages fetched so far remain available in `imdb_pages`.
IMDB <- data.frame()

# The `start` query parameter is the 1-based index of the first title on a
# results page, so stepping by 50 walks the listing page by page.
page_starts <- seq(from = 1, to = 3625, by = 50)

# One data frame per page, bound together once after the loop. Calling
# rbind() inside the loop re-copies every row collected so far on each pass.
imdb_pages <- vector("list", length(page_starts))

# Text of the first node matching `css` inside each listing item. html_node()
# yields a missing node (NA text) for items without a match, so the result
# always has one element per item and the columns stay aligned.
item_text <- function(items, css) {
  html_text(html_node(items, css))
}

for (page_index in seq_along(page_starts)) {
  each_page <- page_starts[page_index]
  link <- paste0("https://www.imdb.com/search/title/?title_type=feature&user_rating=6.0,10.0&num_votes=20000,&certificates=US%3AG,US%3APG,US%3APG-13,US%3AR,US%3ANC-17&countries=us&languages=en&sort=user_rating,desc&start=", each_page, "&ref_=adv_nxt")
  page <- read_html(link)
  items <- html_nodes(page, ".lister-item-content")

  # RK, Title and Year are selected page-wide rather than per item; they are
  # expected to match exactly once per title. data.frame() stops with an error
  # if their lengths differ from the per-item columns.
  imdb_pages[[page_index]] <- data.frame(
    RK = html_text(html_nodes(page, ".text-primary")),
    Title = html_text(html_nodes(page, ".lister-item-header a")),
    Year = html_text(html_nodes(page, ".text-muted.unbold")),
    Runtime = item_text(items, ".runtime"),
    Cert = item_text(items, ".certificate"),
    Genre = item_text(items, ".genre"),
    Rating = item_text(items, ".ratings-imdb-rating strong"),
    Meta = item_text(items, ".favorable"),
    Votes = item_text(items, ".sort-num_votes-visible span:nth-child(2)"),
    Gross = item_text(items, ".ghost~ .text-muted+ span"),
    Summary = item_text(items, ".ratings-bar+ .text-muted"),
    Director = item_text(items, ".text-muted+ p a:nth-child(1)"),
    Star1 = item_text(items, ".ghost+ a"),
    # Keep text columns as character on R < 4.0 as well; factors would make
    # the later as.numeric() conversions return level codes.
    stringsAsFactors = FALSE
  )

  # Progress indicator: start index of the page just scraped.
  print(each_page)
}

# The list is unnamed, so the combined frame gets plain 1..n row names.
IMDB <- do.call(rbind, imdb_pages)
## [1] 1
## [1] 51
## [1] 101
## [1] 151
## [1] 201
## [1] 251
## [1] 301
## [1] 351
## [1] 401
## [1] 451
## [1] 501
## [1] 551
## [1] 601
## [1] 651
## [1] 701
## [1] 751
## [1] 801
## [1] 851
## [1] 901
## [1] 951
## [1] 1001
## [1] 1051
## [1] 1101
## [1] 1151
## [1] 1201
## [1] 1251
## [1] 1301
## [1] 1351
## [1] 1401
## [1] 1451
## [1] 1501
## [1] 1551
## [1] 1601
## [1] 1651
## [1] 1701
## [1] 1751
## [1] 1801
## [1] 1851
## [1] 1901
## [1] 1951
## [1] 2001
## [1] 2051
## [1] 2101
## [1] 2151
## [1] 2201
## [1] 2251
## [1] 2301
## [1] 2351
## [1] 2401
## [1] 2451
## [1] 2501
## [1] 2551
## [1] 2601
## [1] 2651
## [1] 2701
## [1] 2751
## [1] 2801
## [1] 2851
## [1] 2901
## [1] 2951
## [1] 3001
## [1] 3051
## [1] 3101
## [1] 3151
## [1] 3201
## [1] 3251
## [1] 3301
## [1] 3351
## [1] 3401
## [1] 3451
## [1] 3501
## [1] 3551
## [1] 3601
# Show the scraped table as an interactive widget without row names, using
# the "Responsive" extension.
datatable(data = IMDB, rownames = FALSE, extensions = "Responsive")

# Persist the raw (uncleaned) scrape so later steps can restart from disk.
save(list = "IMDB", file = "IMDB.rda")

# Disabled CSV export; ESPN_BPI is not defined anywhere in the visible script.
#write.csv(ESPN_BPI,"ESPN_BPI.csv")

Clean up

# Convert the scraped text columns of `IMDB` into typed columns.

# Replace the scraped rank text with the row index. This relies on the rows
# still being in scrape order with default 1..n row names.
IMDB$RK <- as.integer(rownames(IMDB))

# Year: pull out the four-digit year itself instead of stripping the
# surrounding characters. Stripping only "(I)" and the parentheses left
# entries such as "(II) (2015)" as "II 2015", which as.integer() turned into
# NA. Entries with no four-digit run stay NA.
year_text <- as.character(IMDB$Year)
year_pos <- regexpr("[0-9]{4}", year_text)
year_found <- !is.na(year_pos) & year_pos > 0
year_value <- rep(NA_integer_, length(year_text))
# regmatches() returns only the matched substrings, in order, so it lines up
# with the TRUE positions of `year_found`.
year_value[year_found] <- as.integer(regmatches(year_text, year_pos))
IMDB$Year <- year_value

# Runtime: drop the " min" suffix.
IMDB$Runtime <- as.integer(gsub(" min", "", IMDB$Runtime))

# as.character() first so factor columns (the data.frame() default before
# R 4.0) convert by label rather than by internal level code.
IMDB$Rating <- as.numeric(as.character(IMDB$Rating))
IMDB$Meta <- as.numeric(as.character(IMDB$Meta))

# Votes: remove thousands separators.
IMDB$Votes <- as.integer(gsub(",", "", IMDB$Votes, fixed = TRUE))

# Gross: strip the "$" prefix and "M" suffix, then scale millions to dollars.
IMDB$Gross <- as.numeric(gsub("[$M]", "", IMDB$Gross)) * 1000000

# Persist the cleaned table and take a quick look at the rating distribution.
save(IMDB, file = "IMDB3.rda")
boxplot(IMDB$Rating)

# Interactive scatter plot of gross vs. Metascore for titles rated 8 or higher.
pacman::p_load(plotly)

# NOTE(review): the visible script saves "IMDB.rda" and "IMDB3.rda" but never
# writes "IMDB3625.rda" -- confirm this file exists and holds the cleaned IMDB.
load("IMDB3625.rda")

# Point size encodes Rating and colour encodes Cert; Title is passed through
# the `text` aesthetic.
p <- ggplot(
  filter(IMDB, Rating >= 8),
  aes(Gross, Meta, size = Rating, color = Cert, text = Title)
) +
  geom_point() +
  theme_bw()

# Convert the static ggplot into an interactive plotly widget.
ggplotly(p)